In [1]:
%pylab inline
In [62]:
import nltk
from tethne.readers import zotero
import matplotlib.pyplot as plt
from helpers import normalize_token, filter_token
import networkx as nx
In [5]:
# Root directory of the plain-text corpus (one .txt file per document).
text_root = '../data/SystemsBiology/'
# Escape the dot so the fileid pattern matches only files ending in ".txt";
# the unescaped '.+.txt' would also match names like "notes_txt".
documents = nltk.corpus.PlaintextCorpusReader(text_root, r'.+\.txt')
In [28]:
# POS-tag every token in the corpus so we can later keep only noun-noun
# bigrams.  Tagging the full token stream in one call can be slow on a
# large corpus.
tagged_words = nltk.pos_tag(documents.words())
In [29]:
# Association measures (PMI, likelihood ratio, ...) for scoring bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Count adjacent bigrams over the (token, tag) pairs from the tagged corpus.
finder = nltk.collocations.BigramCollocationFinder.from_words(tagged_words)
In [30]:
def filter_taggedtoken(tagged_token):
    """Return True for purely alphabetic tokens tagged as nouns (any N* tag)."""
    word, pos = tagged_token
    return word.isalpha() and pos.startswith('N')
In [31]:
# Ignore bigrams seen fewer than 3 times; rare pairs produce unreliable
# (often inflated) PMI scores.
finder.apply_freq_filter(3)
# Drop any bigram containing a token that is not an alphabetic noun.
finder.apply_word_filter(lambda token: not filter_taggedtoken(token))
# Top 10 remaining bigrams ranked by pointwise mutual information.
finder.nbest(bigram_measures.pmi, 10)
Out[31]:
In [33]:
# Top 20 bigrams by log-likelihood ratio, which rewards frequent pairs
# more than PMI does.
finder.nbest(bigram_measures.likelihood_ratio, 20)
Out[33]:
In [34]:
# Unpack the (ngram, PMI score) pairs into two parallel tuples for plotting;
# score_ngrams already returns an iterable, so no wrapping list is needed.
ngram, score = zip(*finder.score_ngrams(bigram_measures.pmi))
In [35]:
# Distribution of PMI scores, used to pick the selection threshold below.
plt.hist(score)
plt.xlabel('PMI score')
plt.ylabel('Number of bigrams')
plt.title('Distribution of bigram PMI scores')
plt.show()
In [37]:
# Keep every bigram with positive PMI (co-occurs more often than chance);
# above_score returns an iterable, so list() replaces the identity comprehension.
selected_ngrams = list(finder.above_score(bigram_measures.pmi, 0.0))
# Peek at the tail of the ranked list.
selected_ngrams[-20:]
Out[37]:
In [38]:
# Strip the POS tags from each high-PMI bigram, keeping just the word pairs;
# these become the multi-word expressions for the tokenizer below.
selected_expressions = [
    (first, second)
    for (first, _), (second, _) in finder.above_score(bigram_measures.pmi, 1.0)
]
selected_expressions[:10]
Out[38]:
In [39]:
# Tokenizer that merges each selected word pair into a single token
# (a multi-word expression joined with '_').
tokenizer = nltk.MWETokenizer(selected_expressions)
In [41]:
# Re-tokenize each document so the selected collocations become single
# tokens, then normalize.  A token is kept when it passes filter_token or
# is a merged multi-word expression (contains '_'); the underscore is
# replaced with a space for readability.
words_with_bigrams = []
for fileid in documents.fileids():
    for token in tokenizer.tokenize(documents.words(fileids=[fileid])):
        if filter_token(token) or '_' in token:
            words_with_bigrams.append(normalize_token(token).replace('_', ' '))
In [49]:
# Spot-check a small slice of the processed token stream.
words_with_bigrams[20:30]
Out[49]:
In [50]:
# Rebuild the collocation finder on the normalized tokens, now counting
# co-occurrence within a 10-token window rather than strict adjacency.
finder = nltk.collocations.BigramCollocationFinder.from_words(words_with_bigrams, window_size=10)
# Require at least 2 co-occurrences.
finder.apply_freq_filter(2)
# NOTE(review): merged expressions now contain spaces, so this filter also
# discards them unless filter_token accepts such strings -- confirm in helpers.
finder.apply_word_filter(lambda token: not filter_token(token))
In [51]:
# Top 20 windowed co-occurrence pairs by PMI.
finder.nbest(bigram_measures.pmi, 20)
Out[51]:
In [53]:
# Unpack (pair, PMI score) tuples into parallel tuples; note this rebinds
# `score` from the earlier adjacency-based analysis.
pair, score = zip(*finder.score_ngrams(bigram_measures.pmi))
In [54]:
# Distribution of windowed-bigram PMI scores, used to pick the edge
# threshold for the co-occurrence graph below.
plt.hist(score)
plt.xlabel('PMI score')
plt.ylabel('Number of pairs')
plt.title('Distribution of windowed co-occurrence PMI scores')
plt.show()
In [103]:
# Build an undirected co-occurrence graph: nodes are tokens, edges connect
# strongly associated pairs, weighted by PMI.
PMI_EDGE_THRESHOLD = 8.0  # cutoff chosen by eye from the score histogram above
graph = nx.Graph()
# The loop variable is named pmi_score (not `score`) so it does not clobber
# the `score` tuple unpacked earlier, keeping the histogram cell re-runnable.
for (token1, token2), pmi_score in finder.score_ngrams(bigram_measures.pmi):
    # Skip weakly associated pairs and self-loops.
    if pmi_score > PMI_EDGE_THRESHOLD and token1 != token2:
        graph.add_edge(token1, token2, weight=pmi_score)
In [104]:
# Graph summary: number of nodes, number of edges, and number of
# connected components.
graph.order(), graph.size(), nx.number_connected_components(graph)
Out[104]:
In [105]:
# Persist the co-occurrence graph as GraphML for exploration in external
# tools such as Gephi or Cytoscape.
nx.write_graphml(graph, 'colocation.graphml')
In [ ]: